找传奇、传世资源到传世资源站!

scrapy抓取安居客数据

8.5玩家评分(1人评分)
下载后可评
介绍 评论 失效链接反馈

采用python scrapy抓取安居客数据
from clipboard
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy import log
import re
from urllib import request
from fang.items import AnJuKeItem
import json
import time
import os

class AnJuKeMobileSpider(CrawlSpider):
    """Crawl Shanghai new-development pages on m.anjuke.com.

    Starting from the paginated ajax listing, the rules follow further
    listing pages and dispatch development detail pages to ``parse_main``
    and their ``params`` pages to ``parse_params``.

    Both callbacks build an ``AnJuKeItem`` and print it; neither returns the
    item. NOTE(review): returning/yielding the item is presumably wanted so
    that item pipelines receive it -- confirm before changing.
    """

    name = "anjukemobile"
    allowed_domains = ['m.anjuke.com']
    start_urls = ['https://m.anjuke.com/sh/loupan/newajax/all/?q=&lat=0&lng=0&page=1']
    # Raw strings with an explicit ``\d+``: the numeric ids/page numbers are
    # one or more digits, and the literal ``?`` of the query string has to be
    # escaped or it would make the preceding ``/`` optional instead.
    rules = (
        Rule(LinkExtractor(allow=r'/loupan/newajax/all/\?q=&lat=0&lng=0&page=\d+'), follow=True),
        Rule(LinkExtractor(allow=r'/sh/loupan/\d+/$'), callback="parse_main", follow=True),
        Rule(LinkExtractor(allow=r'/sh/loupan/\d+/params/'), callback="parse_params"),
        # NOTE(review): ``parse_image`` is not defined in the visible part of
        # this class -- confirm it exists elsewhere or add it.
        Rule(LinkExtractor(allow=r'/sh/loupan/\d+/xiangce/\d+/$'), callback="parse_image"),
    )

    def parse_main(self, response):
        """Extract status, name and address from a development detail page.

        Developments without a status, or whose status contains "已售罄",
        are skipped. Any extraction error (e.g. a missing node) is logged at
        ERROR level rather than propagated.

        :param response: the Scrapy response for a ``/sh/loupan/<id>/`` page.
        """
        self.logger.info('down load url %s', response.url)
        print(response.url)
        base = '//*[@id="container"]/div[@class="lpbase"]'
        title = base + '/div[@class="lptitle"]/span/'
        try:
            item = AnJuKeItem()
            status = response.xpath(title + 'em/text()').extract()
            if status and "已售罄" not in status[0]:
                item['status'] = status[0]
                projects = response.xpath(title + 'h1/text()').extract()
                item['project_name'] = projects[0]
                address = response.xpath(
                    base + '/div[@class="lpinfo"]/a/p[@class="g-overflow-third"]//text()'
                ).extract()
                # Normalise once: trim and drop non-breaking spaces.
                cleaned = address[0].strip().replace('\xa0', '')
                item['address'] = cleaned
                item['province'] = '上海市'
                item['city'] = '上海市'
                # The district is the part of the address before the first '-'.
                item['district'] = cleaned.split('-')[0]

                print(projects[0])
                print(item)
        except Exception as error:
            self.logger.error(error)

    def parse_params(self, response):
        """Extract status, delivery time, price and renovation from a params page.

        Developments without a status, or whose status contains "已售罄",
        are skipped. Any extraction error is logged at ERROR level together
        with the page URL rather than propagated.

        :param response: the Scrapy response for a ``/sh/loupan/<id>/params/`` page.
        """
        self.logger.info('down load param %s', response.url)
        base = '//*[@id="main-page"]/div[@class="pcontainer"]'

        def texts(suffix):
            # All text nodes matched by ``base + suffix``.
            return response.xpath(base + suffix).extract()

        try:
            item = AnJuKeItem()
            status = texts('/ul/li[2]/span/text()')
            if status and "已售罄" not in status[0]:
                item['status'] = status[0]
                item['project_name'] = texts('/h3[1]/text()')[0]
                item['delivery_time'] = texts('/ul/li[4]/span/text()')[0]
                item['unit_price'] = texts('/ul[2]/li[1]/span/text()')[0]
                # The third entry of the third list is only used when its
                # label says it is the renovation standard.
                renovation_label = texts('/ul[3]/li[3]/label/text()')[0]
                if "装修标准" in renovation_label:
                    item['renovation'] = texts('/ul[3]/li[3]/span/text()')[0]
                else:
                    item['renovation'] = None
                print(item)
        except Exception as error:
            # Log the cause as well as the URL; the URL alone hides what failed.
            self.logger.error('%s: %s', response.url, error)
        print(response.url)

评论

发表评论必须先登录, 您可以 登录 或者 注册新账号 !


在线咨询: 问题反馈
客服QQ:174666394

有问题请留言,看到后及时答复